Part 1. Problem Set Questions¶
The purpose of this project was to analyze the features a dataset, containig timestamps from turnstyles of the NYC Subway system, which was combined with data from Weather Underground from the same time period to see if there is a significant difference in subway ridership on rainy and non-rainy days. The readings are stored in the file in four hour bins, with totals on entry and exit counts per bin, per Unit ID, which is effectively the same as a station ID. The dataset includes several other features related to the station and weather conditions including fog, and wind conditions, and latitude and longitude of the station and weather reading. Another goal is to explore the data to find and show other interesting features that show a correlation with subway ridership, and report some other interesting findings.
PS 2.0 Initial setup and data input¶
import csv
import pandas as pd
import numpy as np
import datetime
import pandasql
import matplotlib.pyplot as plt
%matplotlib inline
weather_data = pd.read_csv('turnstile_weather_v2.csv')
orig_weather_data = pd.read_csv('turnstile_data_master_with_weather.csv')
print "weather_data columns: "
print weather_data.columns.values
print "orig_weather_data columns: "
print orig_weather_data.columns.values
PS 2.1 Number of Rainy Days¶
This function should return the count of days whenit rained, indicated by rain = 1.
def num_rainy_days(data):
'''
Selects a count of the number of rows indicate rain = 1, or a boolean rainy day indicator
'''
q = "SELECT count(*) from data where rain = 1"
#Execute your SQL command against the pandas frame
rainy_days = pandasql.sqldf(q.lower(), locals())
return rainy_days
#Test
print 'Test Results:'
print num_rainy_days(weather_data)
PS 2.2 Temp on Foggy and Nonfoggy Days¶
This function should return the maximum temperature in the tempi column for days when it was foggy, and not foggy indicated by the column named fog = 0 or 1.
def max_temp_aggregate_by_fog(data):
'''
Returns two rows - whether it was foggy or not (0 or 1) and the max
maxtempi for that fog value (i.e., the maximum max temperature
for both foggy and non-foggy days).
'''
q= "SELECT fog,max(tempi) as maxtempi from data group by fog;"
#Execute your SQL command against the pandas frame
foggy_days = pandasql.sqldf(q.lower(), locals())
return foggy_days
#Test
print 'Test Results:'
print max_temp_aggregate_by_fog(weather_data)
PS 2.3 Mean Temp on Weekends¶
This function should return the average mean temperature on weekend days. For the test we used the original weather data, which includes the meantempi column, containing mean temperatures.
def avg_weekend_temperature(data):
'''
Returns the average meantempi on days that are a Saturday
or Sunday.
'''
data['date'] = pd.to_datetime(data['DATEn'])
q = """SELECT avg(meantempi)
from data
where cast (strftime('%w', date) as integer) == 0
OR cast (strftime('%w', date) as integer) == 6;"""
#Execute your SQL command against the pandas frame
mean_temp_weekends = pandasql.sqldf(q.lower(), locals())
return mean_temp_weekends
orig_weather_data = pd.read_csv('turnstile_data_master_with_weather.csv')
print 'Test Results:'
print avg_weekend_temperature(orig_weather_data)
PS 2.4 Mean Temp on Rainy Days¶
This function returns the average minimum temperature on days where it rains and the minimum temperature in the mintempi column is above 55 degrees.
def avg_min_temperature(data):
'''
Returns the average minimum temperature (mintempi column of the weather dataframe) on
rainy days where the minimum temperature is greater than 55 degrees.
'''
#print data.head(5)
q = """select avg(mintempi)
from data
where mintempi > 55
and rain=1;"""
#Execute your SQL command against the pandas frame
avg_min_temp_rainy = pandasql.sqldf(q.lower(), locals())
return avg_min_temp_rainy
#Test
orig_weather_data = pd.read_csv('turnstile_data_master_with_weather.csv')
print 'Test Results:'
print avg_min_temperature(orig_weather_data)
PS 2.5 Fixing Turnstile Data¶
This function reads turnstile files in their original format, and breaks lines with multiple entries per line for each turnstile into a file with one entry per line with equal column counts.
def fix_turnstile_data(filenames):
'''
Reads input files from a set of filenames and updates each row in the text
file so there is only one entry per row. Then write the updates to a different text
file in the format of "updated_" + filename.
'''
for name in filenames:
# your code here
f_out = open("updated_"+name,'w')
with open(name, 'rb') as f:
reader = csv.reader(f, delimiter=',')
writer = csv.writer(f_out, delimiter=',')
for row in reader:
sample = [row[0] , row[1] , row[2]]
for i in range(3,len(row),5):
if (i + 4) <= len(row):
curr_row = sample + [row[i] ,row[i+1] , row[i+2] , row[i+3], row[i+4] ]
writer.writerow(curr_row)
f_out.close
#Test
print 'Test Results:'
fix_turnstile_data(['turnstile_110507.txt','turnstile_110528.txt'])
f1=pd.read_csv('updated_turnstile_110507.txt')
f2=pd.read_csv('updated_turnstile_110507.txt')
print f1.head(5)
print f2.head(5)
PS 2.6 Combining Turnstile Data¶
This function reads a list of filenames and combines the rows from each file into a new file, with one header on the first line.
def create_master_turnstile_file(filenames, output_file):
'''
Reads files from a list of input filenames, which all have the
columns 'C/A, UNIT, SCP, DATEn, TIMEn, DESCn, ENTRIESn, EXITSn', and consolidates
them into one file located at output_file.
'''
with open(output_file, 'w') as master_file:
master_file.write('C/A,UNIT,SCP,DATEn,TIMEn,DESCn,ENTRIESn,EXITSn\n')
for filename in filenames:
# your code here
with open(filename, 'rb') as f:
reader = csv.reader(f, delimiter=',')
writer = csv.writer(master_file, delimiter=',')
for row in reader:
writer.writerow(row)
f.close
master_file.close
create_master_turnstile_file(['updated_turnstile_110507.txt','updated_turnstile_110528.txt'],'combined_turnstile_files.txt')
print 'Test Results:'
f1=pd.read_csv('combined_turnstile_files.txt')
f1.head(5)
PS 2.7 Filtering Irregular Data¶
This function reads a file and only regurns rows where the DESCn column is equal to the value 'REGULAR'.
def filter_by_regular(filename):
'''
This function reads the csv file located at filename into a pandas dataframe,
and filter the dataframe to only rows where the 'DESCn' column has the value 'REGULAR'.
'''
turnstile_data = pd.read_csv(filename)
turnstile_data = turnstile_data[turnstile_data['DESCn']=='REGULAR']
return turnstile_data
#Test
print 'Test Results:'
filter_by_regular('combined_turnstile_files.txt').head(5)
PS 2.8 Get Hourly Entries¶
This function takes a dataframe as input, and it adds a derrived column called ENTRIESn_hourly that calculates the number of entries houlry by subtracting the value ENTRIESn of the record from the previous hour from the next entry to get an hourly total.
def get_hourly_entries(df):
'''
This function changes these cumulative entry numbers to a count of entries since the last reading
and stores the value in a column named ENTRIESn_hourly
(i.e., entries since the last row in the dataframe).
'''
#your code here
df_prev = df['ENTRIESn'].shift(1)
df['ENTRIESn_hourly'] = df['ENTRIESn'] - df_prev
#df['ENTRIESn_hourly'].iloc(0).replace(1)
#print df.head(4).fillna(1)
return df.fillna(1)
#Test
print 'Test Results:'
data = pd.read_csv('combined_turnstile_files.txt')
get_hourly_entries(data).head(5)
PS 2.9 Get Hourly Exits¶
This function calculates the number of exists since the last reading by subtracting the previous row's EXISTSn value from the same column in the current row, and adds this value under a column named EXITSn_hourly.
def get_hourly_exits(df):
'''
This function changes the cumulative exit numbers to a count of exits since the last reading
and stores the result in a new column named EXITSn_hourly.
'''
df_prev = df['EXITSn'].shift(1)
df['EXITSn_hourly'] = df['EXITSn'] - df_prev
return df.fillna(0)
#Test
print 'Test Results:'
data = pd.read_csv('combined_turnstile_files.txt')
print get_hourly_exits(data).head(5)
PS 2.10 Time to Hour¶
This function converts a time format with a two digit hour into a single integer value, removing any most significant digits that are zero, to return only a single digit number if the value is less than 10.
def time_to_hour(time):
'''
Given an input variable time that represents time in the format of:
"00:00:00" (hour:minutes:seconds)
returns the hour value as an integer without any leading zero.
'''
hour = int(time.split(':')[0])
return hour
#Test
print 'Input Values:'
print '10:35:00'
print '22:55:00'
print '02:12:00'
print '\nTest Results:'
print time_to_hour('10:35:00')
print time_to_hour('22:55:00')
print time_to_hour('02:12:00')
PS 2.11 Reformat Subway Dates¶
This function should take a value in the MTA Subway date format mm-dd-yy and returns an equivilent date in the Weather Underground date format YYYY-mm-dd, so datasets from both sources can be joined and compared.
def reformat_subway_dates(date):
'''
Takes MTA Subway date format as input and returns Weather Underground date format.
'''
date_formatted = datetime.datetime.strptime(date,"%m-%d-%y").strftime("%Y-%m-%d") # your code here
return date_formatted
#Test
print 'Test Results:'
print 'Input Values:'
print '10-09-15'
print '02-28-13'
print '06-01-12'
print '\nTest Results:'
print reformat_subway_dates('10-09-15')
print reformat_subway_dates('02-28-13')
print reformat_subway_dates('06-01-12')
PS 3.1 Exploratory Data Analysis¶
def entries_histogram(turnstile_weather):
'''
Plots two histograms on the same axes to show hourly
entries when raining vs. when not raining.
'''
plt.figure()
# your code here to plot a historgram for hourly entries when it is raining
turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==0] \
.plot(kind='hist', stacked=True, bins=50, color='green')
# your code here to plot a historgram for hourly entries when it is not raining
turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==1] \
.plot(kind='hist', stacked=True, bins=50, color='blue')
return plt
#Test
print 'Test Results:'
entries_histogram(weather_data).show()
PS 3.2 Welch's T-Test?¶
Should the Welch's T-Test be used to compare the difference in these two sets?
The histogram above shows that the data is not normally distributed, therefore a the Welch's t-test is not an appropriate test of the two samples.
The Mann Whitney U-test can be used to test non-normalaly distributed sets like these.
PS 3.3 Mann-Whitney U-Test¶
import scipy
import scipy.stats
def mann_whitney_plus_means(turnstile_weather):
'''
This function consumes the turnstile_weather dataframe containing
our final turnstile weather data.
dataframe.
This function returns:
1) the mean of entries with rain
2) the mean of entries without rain
3) the Mann-Whitney U-statistic and p-value comparing the number of entries
with rain and the number of entries without rain
'''
with_rain = turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==1]
without_rain = turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==0]
with_rain_mean = np.mean(with_rain)
without_rain_mean = np.mean(without_rain)
U,p = scipy.stats.mannwhitneyu(with_rain, without_rain)
return with_rain_mean, without_rain_mean, U, p # leave this line for the grader
print 'Test Results:'
print mann_whitney_plus_means(weather_data)
PS 3.4 Ridership on Rainy vs. Nonrainy Days¶
The Mann Whitney U-Test shows that the distributions are statistically different because of the really low value of the p-statistic in it's result of 2.74 x 10^-6 or 0.00000274. Since this is below the 95% CI p-value of 0.05 we can assume that there is a difference between subway ridership on rainy and non-rainy days.
PS 3.5 Linear Regression¶
import statsmodels.api as sm
def linear_regression(features, values):
"""
Performs linear regression given a data set with an arbitrary number of features.
"""
features = sm.add_constant(features)
model = sm.OLS(values,features)
results = model.fit()
intercept = results.params[0]
params = results.params[1:]
return intercept, params
def predictions(dataframe):
'''
The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
Using the information stored in the dataframe. This function predicts the ridership of
the NYC subway using linear regression with gradient descent.
'''
dataframe['day_of_week'] = pd.to_datetime(dataframe['DATEn'])
dataframe['day_of_week'] = dataframe['day_of_week'].dt.dayofweek
features = dataframe[['rain','day_of_week','hour']]
dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
features = features.join(dummy_units)
# Values
values = dataframe['ENTRIESn_hourly']
# Perform linear regression
intercept, params = linear_regression(features, values)
predictions = intercept + np.dot(features, params)
# print intercept
# print params
return predictions
print 'Test Results: '
weather_data = pd.read_csv('turnstile_weather_v2.csv')
print predictions(weather_data)
PS 3.6 Plotting Residuals¶
import matplotlib.pyplot as plt
def plot_residuals(turnstile_weather, predictions):
'''
Plots a histogram of the residuals between the original hourly entry data and the predicted values.
'''
plt.figure()
(turnstile_weather['ENTRIESn_hourly'] - predictions).hist(bins=50)
return plt
print 'Test Results:'
weather_data = pd.read_csv('turnstile_weather_v2.csv')
plot_residuals(weather_data,predictions(weather_data)).show()
PS 3.7 Compute R^2¶
import sys
def compute_r_squared(data, predictions):
'''
This function computes and return the coefficient of determination (R^2)
for the input data vs. input predictions.
'''
n = len(data)
rss = np.sum(np.power((data - predictions), 2))
tss = np.sum(np.power((data - np.mean(data)), 2))
# calculate the R Squared value
r_squared = 1 - (rss / tss)
return r_squared
print 'Test Results: '
weather_data = pd.read_csv('turnstile_weather_v2.csv')
print compute_r_squared(weather_data['ENTRIESn_hourly'],predictions(weather_data))
PS 3.8 Gradient Descent¶
from sklearn.linear_model import SGDRegressor
"""
In this question, you need to:
1) Implement the linear_regression() procedure using gradient descent.
You can use the SGDRegressor class from sklearn, since this class uses gradient descent.
2) Select features (in the predictions procedure) and make predictions.
"""
def normalize_features(features):
'''
Returns the means and standard deviations of the given features, along with a normalized feature
matrix.
'''
means = np.mean(features, axis=0)
std_devs = np.std(features, axis=0)
normalized_features = (features - means) / std_devs
return means, std_devs, normalized_features
def recover_params(means, std_devs, norm_intercept, norm_params):
'''
Recovers the weights for a linear model given parameters that were fitted using
normalized features. Takes the means and standard deviations of the original
features, along with the intercept and parameters computed using the normalized
features, and returns the intercept and parameters that correspond to the original
features.
'''
intercept = norm_intercept - np.sum(means * norm_params / std_devs)
params = norm_params / std_devs
return intercept, params
def linear_regression(features, values):
"""
Perform linear regression given a data set with an arbitrary number of features.
"""
clf = SGDRegressor(n_iter=50)
clf.fit(features, values)
intercept = clf.intercept_
params = clf.coef_
return intercept, params
def predictions_SGD(dataframe):
'''
This function predicts the ridership of the NYC subway using linear regression with gradient
descent using certain variables from the NYC transit weather data dataframe.
'''
dataframe['day_of_week'] = pd.to_datetime(dataframe['DATEn'])
dataframe['day_of_week'] = dataframe['day_of_week'].dt.dayofweek
features = dataframe[['rain','day_of_week','hour']]
dummy_units = pd.get_dummies(dataframe['UNIT'], prefix='unit')
features = features.join(dummy_units)
# Values
values = dataframe['ENTRIESn_hourly']
# Get numpy arrays
features_array = features.values
values_array = values.values
means, std_devs, normalized_features_array = normalize_features(features_array)
# Perform gradient descent
norm_intercept, norm_params = linear_regression(normalized_features_array, values_array)
intercept, params = recover_params(means, std_devs, norm_intercept, norm_params)
predictions = intercept + np.dot(features_array, params)
return predictions
weather_data = pd.read_csv('turnstile_weather_v2.csv')
print 'Test Results:'
print 'R^2: ' + str(compute_r_squared(weather_data['ENTRIESn_hourly'],predictions_SGD(weather_data)))
plot_residuals(weather_data,predictions_SGD(weather_data)).show()
print predictions_SGD(weather_data)
PS 4.1 Visualization 1¶
This visualization plots a graph showing mean ridership by hour for all rainy and non-rainy days.
import matplotlib.pyplot as plt
from datetime import datetime as dt
def plot_entries_per_hour(turnstile_weather):
'''
Plots a graph showing mean ridership by hour each day on rainy and non-rainy days.
'''
turnstile_weather['day_of_week'] = pd.to_datetime(turnstile_weather['DATEn'])
turnstile_weather['day_of_week'] = turnstile_weather['day_of_week'].dt.dayofweek
rainy = {0: 'non-rainy', 1: 'rainy'}
turnstile_weather['rainy'] = turnstile_weather['rain'].apply(lambda x: rainy[x])
turnstile_weather['time'] = turnstile_weather['TIMEn'].apply(lambda x: dt.strptime(x,'%H:%M:%S'))
df2 = turnstile_weather.groupby(['time','rainy'])['ENTRIESn_hourly'].mean().reset_index()
df2_rain = df2[df2['rainy']=='rainy']
df2_no_rain = df2[df2['rainy']=='non-rainy']
rain, = plt.plot(df2_rain['time'],df2_rain['ENTRIESn_hourly'], 'b-')
no_rain, = plt.plot(df2_no_rain['time'],df2_no_rain['ENTRIESn_hourly'], 'r-')
plt.title('Hourly Ridership by Time of Day')
plt.xlabel('Time of Day')
plt.ylabel('Average Daily Rider Count')
plt.yticks(rotation=0)
plt.xticks(rotation=-90)
plt.legend([rain,no_rain], ['Rain', 'No Rain'])
return plt
weather_data = pd.read_csv('turnstile_weather_v2.csv')
plot_entries_per_hour(weather_data).show()
PS 4.2 Visualization 2¶
This visualization shows the mean ridership for all stations each day of the week.
from ggplot import *
import datetime
def plot_entries_per_weekday(turnstile_weather):
'''
Plots a graph showing the mean number of subway entries by weekday on rainy and non-rainy days
'''
turnstile_weather['day_of_week'] = pd.to_datetime(turnstile_weather['DATEn'])
turnstile_weather['day_of_week'] = turnstile_weather['day_of_week'].dt.dayofweek
rainy = {0: 'non-rainy', 1: 'rainy'}
turnstile_weather['rainy'] = turnstile_weather['rain'].apply(lambda x: rainy[x])
turnstile_weather = turnstile_weather.groupby(['rainy','day_of_week'])['ENTRIESn_hourly'].mean()
turnstile_weather = turnstile_weather.reset_index()
#print turnstile_weather
plot = ggplot(turnstile_weather, aes(x='day_of_week', y='ENTRIESn_hourly', color='rainy',fill='rainy')) + \
geom_point() + \
stat_smooth() + \
xlab('Day of Week') + \
ylab('Mean Number of Entries Hourly') + \
scale_x_discrete(breaks = [0,1,2,3,4,5,6], \
labels = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']) + \
ggtitle('Mean Hourly Entries on Each Day of the Week')
return plot
weather_data = pd.read_csv('turnstile_weather_v2.csv')
plot_entries_per_weekday(weather_data)
Part 2 Questions:¶
Section 0. References¶
Please include a list of references you have used for this project. Please be specific - for example, instead of including a general website such as stackoverflow.com, try to include a specific topic from Stackoverflow that you have found useful.
- http://blog.yhathq.com/posts/aggregating-and-plotting-time-series-in-python.html
- https://georgemdallas.wordpress.com/2013/10/30/principal-component-analysis-4-dummies-eigenvectors-eigenvalues-and-dimension-reduction/
- https://en.wikipedia.org/wiki/Welch%27s_t_test
- http://scikit-learn.org/stable/modules/sgd.html
- http://docs.ggplot2.org/current/scale_continuous.html
- http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
- http://stats.stackexchange.com/questions/36064/calculating-r-squared-coefficient-of-determination-with-centered-vs-un-center
- http://stats.stackexchange.com/questions/110380/smoother-lines-for-ggplot2
- http://matplotlib.org/users/pyplot_tutorial.html
- http://blog.yhathq.com/posts/ggplot-for-python.html
- http://matplotlib.org/examples/index.html
- http://chrisalbon.com/python/pandas_apply_operations_to_groups.html
- https://gehrcke.de/2013/07/data-analysis-with-pandas-enjoy-the-awesome/
- http://matplotlib.org/examples/pylab_examples/date_demo_convert.html
- http://matplotlib.org/1.3.0/examples/pylab_examples/legend_demo.html
- http://stackoverflow.com/questions/332289/how-do-you-change-the-size-of-figures-drawn-with-matplotlib
- https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet
- http://strftime.org/
Section 1. Statistical Test¶
1.1 Which statistical test did you use to analyze the NYC subway data?¶
Because of the results shown in the histogram of entry counts over time, we can see that the sets are not normally distributed. In fact it's positively skewed, meaning the most of the samples occour in the lower half of the histogram.
Becaue the Welch's T-test is not appropriate for testing non-normally distributed data, we performed the Mann Whitney U-Test, wich can compare two sets of data regardless of the distribution skew.
- Did you use a one-tail or a two-tail P value?
For this test we are using a two-tailed P value because ridership could be either greater or less on rainy days than clear days. We don't have any determination that ridership will be greater or less on rainy days, so we're performing a two-tailed test.
- What is the null hypothesis?
In this test we are considering the null hypothosis is that ridership on the NYC subway in terms of entries per hour is not significantly different on days where there is rainy weather, when compared with days where there is clear weather.
- What is your p-critical value?
For a two-taled test of 95% significance we use a p-critical value of 0.05. This means if the value of the p-value returned by the Mann Whitney U-test is less than 0.05, then there is a significant differnce in ridership between the rany and non-rainy sets.
1.2 Why is this statistical test applicable to the dataset? In particular, consider the assumptions that the test is making about the distribution of ridership in the two samples.¶
Becaue the Welch's T-test is not appropriate for testing non-normally distributed data, we performed the Mann Whitney U-Test, wich can compare two sets of data regardless of the distribution skew.
1.3 What results did you get from this statistical test? These should include the following numerical values: p-values, as well as the means for each of the two samples under test.¶
For a two-taled test of 95% significance test we use a p-critical value of 0.05. This means if the value of the p-value returned by the Mann Whitney U-test is less than 0.05, then there is a significant differnce in ridership between the rany and non-rainy sets.
1.4 What is the significance and interpretation of these results?¶
In the U-test, we got a U value of 153635120.5 and a p value of 2.74 x 10^-6, or approximately 0.00003, well below the 0.05 p-critical. This means that the samples are discernably different with 95% significance.
Section 2. Linear Regression¶
2.1 What approach did you use to compute the coefficients theta and produce prediction for ENTRIESn_hourly in your regression model:¶
I made my first prediction using Statsmodels OLS, but also used Scikit Gradient Descent to make an additional prediction for comparison.
2.2 What features (input variables) did you use in your model? Did you use any dummy variables as part of your features?¶
I used rain, time of day, and Day of week as features.
2.3 Why did you select these features in your model? We are looking for specific reasons that lead you to believe that the selected features will contribute to the predictive power of your model. Your reasons might be based on intuition. For example, response for fog might be: “I decided to use fog because I thought that when it is very foggy outside people might decide to use the subway more often.” Your reasons might also be based on data exploration and experimentation, for example: “I used feature X because as soon as I included it in my model, it drastically improved my R2 value.”¶
I computed day of week because I could see that certain days were obviously busier than others and thought there could be something going on there. Once grouping by day of week, it's apparent this is a good predictor of subway ridership, probably because the majority of the people share a similar work schedule.
I actually chose these purely by trial and error. I think there is probably a way to plot these and get an idea if there seems to be a visible trend between two variables, but I actually plugged in values trying to get a better R^2, without getting a lot of luck with most of the other weather indicators. Rain helped, but wasn't a good enough predictor on it's own. I think UNIT would have been a good one, since some stations are obviously busier than others, but I didn't have any luck plugging it in as a feature, probably because it's the wrong type.
2.4 What are the parameters (also known as "coefficients" or "weights") of the non-dummy features in your linear regression model?¶
From the linear regression function I got the following values for params:
- rain = 117.225179
- day_of_week = -141.600626
- hour = 123.307487
2.5 What is your model’s R2 (coefficients of determination) value?¶
For my linear regression model the value of R^2 is:
0.469
2.6 What does this R2 value mean for the goodness of fit for your regression model? Do you think this linear model to predict ridership is appropriate for this dataset, given this R2 value?¶
I think it's a fairly good model that could probably be improved by using the right combination of features, or additional derrived data elements. I did find that adding additional features that weren't good indicators of ridership only worsened the results.
Section 3. Visualization¶
Please include two visualizations that show the relationships between two or more variables in the NYC subway data. Remember to add appropriate titles and axes labels to your plots. Also, please add a short description below each figure commenting on the key insights depicted in the figure.
3.1 One visualization should contain two histograms: one of ENTRIESn_hourly for rainy days and one of ENTRIESn_hourly for non-rainy days. You can combine the two histograms in a single plot or you can use two separate plots. If you decide to use to two separate plots for the two histograms, please ensure that the x-axis limits for both of the plots are identical. It is much easier to compare the two in that case. For the histograms, you should have intervals representing the volume of ridership (value of ENTRIESn_hourly) on the x-axis and the frequency of occurrence on the y-axis. For example, each interval (along the x-axis), the height of the bar for this interval will represent the number of records (rows in our data) that have ENTRIESn_hourly that falls in this interval. Remember to increase the number of bins in the histogram (by having larger number of bars). The default bin width is not sufficient to capture the variability in the two samples.¶
import matplotlib.patches as mpatches
def entries_histogram(turnstile_weather):
'''
Plots two histograms on the same axes to show hourly
entries when raining vs. when not raining.
'''
turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==0] \
.plot(kind='hist', stacked=True, bins=50, color='green')
turnstile_weather['ENTRIESn_hourly'][turnstile_weather['rain']==1] \
.plot(kind='hist', stacked=True, bins=50, color='blue')
no_rain = mpatches.Patch(color='green', label='No Rain')
rain = mpatches.Patch(color='blue', label='Rain')
plt.title('Histogram Showing Occourences of Levels of Ridership')
plt.xlabel('Volume of Ridership')
plt.ylabel('Number of Log Entries')
plt.legend([rain,no_rain], ['Rain', 'No Rain'])
return plt
weather_data = pd.read_csv('turnstile_weather_v2.csv')
entries_histogram(weather_data).show()
3.2 One visualization can be more freeform. You should feel free to implement something that we discussed in class (e.g., scatter plots, line plots) or attempt to implement something more advanced if you'd like. Some suggestions are:¶
- Ridership by time-of-day
- Ridership by day-of-week
This visualization shows ridership by date and time for the entire length of the dataset. It also shows entries on rainy days in a different color to visualize times when it was rainy and times when it was not.
import matplotlib.pyplot as plt
from datetime import datetime as dt
def plot_entries_per_day_by_longitudue(turnstile_weather):
'''
Plots a graph showing mean ridership by hour each day on rainy and non-rainy days.
'''
turnstile_weather['day_of_week'] = pd.to_datetime(turnstile_weather['DATEn'])
turnstile_weather['day_of_week'] = turnstile_weather['day_of_week'].dt.dayofweek
rainy = {0: 'non-rainy', 1: 'rainy'}
turnstile_weather['rainy'] = turnstile_weather['rain'].apply(lambda x: rainy[x])
#turnstile_weather['time'] = turnstile_weather['TIMEn'].apply(lambda x: dt.strptime(x,'%H:%M:%S'))
df2 = turnstile_weather.groupby(['longitude','rainy'])['ENTRIESn_hourly'].mean().reset_index()
df2_rain = df2[df2['rainy']=='rainy']
df2_no_rain = df2[df2['rainy']=='non-rainy']
rain, = plt.plot(df2_rain['longitude'],df2_rain['ENTRIESn_hourly'], 'bo')
no_rain, = plt.plot(df2_no_rain['longitude'],df2_no_rain['ENTRIESn_hourly'], 'ro')
plt.legend([rain,no_rain], ['Rain', 'No Rain'])
plt.title('Mean Ridership by Longitude')
plt.xlabel('Longitude')
plt.ylabel('Average Daily Rider Count')
plt.yticks(rotation=0)
plt.xticks(rotation=-90)
plt.legend([rain,no_rain], ['Rain', 'No Rain'])
return plt
def plot_entries_per_day_by_latitudue(turnstile_weather):
'''
Plots a graph showing mean ridership by hour each day on rainy and non-rainy days.
'''
turnstile_weather['day_of_week'] = pd.to_datetime(turnstile_weather['DATEn'])
turnstile_weather['day_of_week'] = turnstile_weather['day_of_week'].dt.dayofweek
rainy = {0: 'non-rainy', 1: 'rainy'}
turnstile_weather['rainy'] = turnstile_weather['rain'].apply(lambda x: rainy[x])
#turnstile_weather['time'] = turnstile_weather['TIMEn'].apply(lambda x: dt.strptime(x,'%H:%M:%S'))
df2 = turnstile_weather.groupby(['latitude','rainy'])['ENTRIESn_hourly'].mean().reset_index()
df2_rain = df2[df2['rainy']=='rainy']
df2_no_rain = df2[df2['rainy']=='non-rainy']
rain, = plt.plot(df2_rain['latitude'],df2_rain['ENTRIESn_hourly'], 'bo')
no_rain, = plt.plot(df2_no_rain['latitude'],df2_no_rain['ENTRIESn_hourly'], 'ro')
plt.title('Mean Ridership by Latitude')
plt.xlabel('Latitude')
plt.ylabel('Average Daily Rider Count')
plt.yticks(rotation=0)
plt.xticks(rotation=-90)
plt.legend([rain,no_rain], ['Rain', 'No Rain'])
return plt
weather_data = pd.read_csv('turnstile_weather_v2.csv')
plot_entries_per_day_by_longitudue(weather_data).show()
plot_entries_per_day_by_latitudue(weather_data).show()
Section 4. Conclusion¶
Please address the following questions in detail. Your answers should be 1-2 paragraphs long.
4.1 From your analysis and interpretation of the data, do more people ride the NYC subway when it is raining or when it is not raining?¶
From my analysis here I would say that on days that it is raining, sigificantly more riders do ride the subway. However most days are not rainy days.
4.2 What analyses lead you to this conclusion? You should use results from both your statistical tests and your linear regression to support your analysis.¶
It appears that on rainy days the mean ridership is usually not much greater than non-rainy days. Referencing the chart in PS 4.2, the difference seems greatest after mid-week when ridership is at it's peak, and least on weekends when ridership tapers to it's lowest levels.
Referencing the chart in PS 4.1, which shows mean ridership throughout the day, it also seems that mean ridership is usually close to the same between rainy and non-rainy days in the morning when people would be commuting to work, but this starts to diverge to favor more ridership in the afternoons on days when it is rainy, and less in the afternoons on days when it's not rainy. I imagine many people decide to walk home on clear days, but most people who commute on the subway in the morning, usually commute via the subway regardless of the weather conditions.
The reason I determined the difference is signficant is becasue our calculated p-value with the Mann-Whitney U-Test was well below the p-critical value of 0.05 with a p-value of 0.00003. Further analysis also shows that on a given weekday, or a given time of day, mean ridership will usually be slighly higher on a rainy day, than a non-rainy day.
Section 5. Reflection¶
Please address the following questions in detail. Your answers should be 1-2 paragraphs long.
5.1 Please discuss potential shortcomings of the methods of your analysis, including:¶
- Dataset,
- Analysis, such as the linear regression model or statistical test
This dataset lacked resolution in the time of day, so it was hard to see distinct patterns throughout the day. Instead we get four-hour snapshots throughout the day. Because there are only a handfull of samples throughout the day, it makes hourly projections difficult and unclear.
Our technique here could be a little bit more accurate with higher resolution data. The issue is we are marking some samples as a rainy day, when in actuality it only rained for part of the day. On these days we're only getting a partial day's worth of samples, and calling it a rainy day, and likewise we're getting a partial day's non-rainy samples for the same day and also calling it a non-rainy day. I think if we had higher frequency entries, we could better judge rates of ridership by hour and make better short-term and partial-day projections. If we had more data it would be best to either discard days that were partially rainy, or develop a function to return a factor for a partial day's results.
Another issue was that there was not much more than one month's worth of data, so it makes projections based on seasonal trends impossible. In fact it's hard to say with much confidence the difference in rainy verses non-rainy ridership because there just arean't many days in our data, which are labeled rainy days.
Linear regression and gradient descent produced similar results with similar input features. It seemed like one or the other might perform slightly better with a given set of input features. Given this dataset, I think it would be difficult to say that one algorithm consistently performed better than the other. I think it would depend on the training data and the features selected. With the current data and features it appears the Linear Regression method gets slightly better results.
5.2 (Optional) Do you have any other insight about the dataset that you would like to share with us?¶
I expected a higher predictive value in the latitude and longitude features since UNIT location seems to be a large component in subway ridership, but they didn't seemt to improve the result much and, in fact, they lowered the R^2 score when added. By themselves they didn't seem to be great features to use for gradient descent or linear regression. I'm still not sure why.
Below I created what I thought could be an interesting visualization showing ridership based on latitude and longitude, but I didn't convert latitude and longitude to cartesian coordinates because I wasn't sure how to do it correctly. I thought for a spatial area as small as NYC, with data as spotty as we had, it might not matter anyway. I'm also uncertain if the surface plot was the best choice, but I'm still learning matplotlib, and it was the most interesting type I could get to work with this data.
It created an interesting graph, which seems to show ridership is greatest at certain stations near the center of the area covered by the population and lower near the edges. I think if I had better experience with matplotlib, I could make it into a better visualization.
I think more data would definately be helpful, both a higher level of detail time and weather-wise, and also a longer time-period of sample data. This does highlight, though just how much data is needed to get good predictive results and to find interesting combinations of features to make better predictions or assertions about the data. I think just to get started I would want hourly samples from all stations for a four-month time-period. I think this is a factor of 16 times more rows of data.
from matplotlib.dates import WeekdayLocator, HourLocator, DateFormatter, DayLocator, SUNDAY, MONDAY
def plot_entries_per_hour2(turnstile_weather):
'''
This function plots total hourly ridership for all locations over time differentiating
ridership on rainy days vs non-rainy days.
'''
rainy = {0: 'non-rainy', 1: 'rainy'}
turnstile_weather['rainy'] = turnstile_weather['rain'].apply(lambda x: rainy[x])
turnstile_weather['datetime'] = pd.to_datetime(turnstile_weather['datetime'])
#turnstile_weather['date'] = pd.to_datetime(turnstile_weather['DATEn'])
#turnstile_weather['time'] = turnstile_weather['TIMEn'].apply(lambda x: dt.strptime(x,'%H:%M:%S'))
turnstile_weather = turnstile_weather.groupby(['datetime','rainy'])['ENTRIESn_hourly'].sum().reset_index()
rainy_weather = turnstile_weather[turnstile_weather['rainy']=='rainy']
nonrainy_weather = turnstile_weather[turnstile_weather['rainy']=='non-rainy']
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 7.5, forward=True)
plt.title('Total Hourly Ridership by Weather and Time of Day')
plt.xlabel('Time of Day')
plt.ylabel('Average Daily Rider Count')
plt.yticks(rotation=0)
plt.xticks(rotation=-90)
ax.xaxis_date()
ax.xaxis.set_major_formatter(DateFormatter('%a %b-%d'))
ax.xaxis.set_minor_locator(DayLocator(interval=1))
ax.xaxis.set_major_locator(WeekdayLocator(MONDAY))
plot = plt.bar(nonrainy_weather['datetime'],nonrainy_weather['ENTRIESn_hourly'], \
label='non-rainy', color='r', width=0.2)
plot = plt.bar(rainy_weather['datetime'],rainy_weather['ENTRIESn_hourly'], \
label='rainy', color='b', width=0.2)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)
return plot
weather_data = pd.read_csv('turnstile_weather_v2.csv')
plot_entries_per_hour2(weather_data)